Loading library

1. Reading input and filter column by index

knitr::opts_chunk$set(echo = TRUE)
price_data <- read.delim("prices-and-earnings.txt")
price_data <- price_data[,c(1,2,5,6,7,9,10,16,17,18,19)]

2. Heatmap of data without reorder

price_data_scale <- scale(price_data[,-1])

plot_ly(x=colnames(price_data_scale), y=rownames(price_data_scale), 
        z=price_data_scale, type="heatmap", colors =colorRamp(c("yellow", "red"))) %>% layout(title = "Heatmap of price dataset")

Analysis: No its not possible to see clusters, very hard to even see an outlier.

3. Distance matrix computation and ordering by Hierarchical Clustering

price_row_dist <- dist(x=price_data_scale, method = "euclidean", diag = TRUE)
price_col_dist <- dist(x=t(price_data_scale), method = "euclidean", diag = TRUE)
price_row_cor <- cor(x=t(price_data_scale))
price_col_cor <- cor(x=price_data_scale)
price_row_cor <- 1- price_row_cor
price_col_cor <- 1- price_col_cor

order1 <- seriate(price_row_dist, "OLO")
order2 <- seriate(price_col_dist, "OLO")
ord1 <- get_order(order1)
ord2 <- get_order(order2)

order1_cor <- seriate(as.dist(price_row_cor), "OLO")
order2_cor <- seriate(as.dist(price_col_cor), "OLO")
ord1_cor <- get_order(order1_cor)
ord2_cor <- get_order(order2_cor)

reordmatr <- price_data_scale[rev(ord1),ord2]
reordmatr_cor <- price_data_scale[rev(ord1_cor),ord2_cor]

plot_ly(x=colnames(reordmatr), y=rownames(reordmatr), 
        z=reordmatr, type="heatmap", colors =colorRamp(c("yellow", "red"))) %>% layout(title = "Heatmap of price dataset ordered using Optimal leaf ordering")
plot_ly(x=colnames(reordmatr_cor), y=rownames(reordmatr_cor), 
        z=reordmatr_cor, type="heatmap", colors =colorRamp(c("yellow", "red"))) %>% layout(title = "Heatmap of correlations of price dataset")

Analysis: Both are graphs are similar to each other. We can see a cluster in the digonals of both the plots.

4. Distance matrix computation and ordering by Traveling Sales Man

order1_tsp <- seriate(price_row_dist, "TSP")
order2_tsp <- seriate(price_col_dist, "TSP")
ord1_tsp <- get_order(order1_tsp)
ord2_tsp <- get_order(order2_tsp)
reordmatr <- price_data_scale[rev(ord1_tsp),ord2_tsp]

plot_ly(x=colnames(reordmatr), y=rownames(reordmatr), 
        z=reordmatr, type="heatmap", colors =colorRamp(c("yellow", "red"))) %>% layout(title = "Heatmap of price dataset ordered using Traveling Sales Man")
## Comparing the optimization
criterion(dist(price_data_scale[rev(ord1_tsp)], method = "euclidean"), method = c("Gradient_raw", "Path_length"))
## Gradient_raw  Path_length 
##  42418.00000     31.77474
criterion(dist(price_data_scale[rev(ord1)], method = "euclidean"), method = c("Gradient_raw", "Path_length"))
## Gradient_raw  Path_length 
##  30318.00000     32.03208

Analysis: The ‘Travelling Sales Man’ approach performs worse than ‘Hierarchical clustering’ in terms of Gradient raw but has a better path length.

5. Parallel Coordinate Plots

setnames(price_data, old = c("Food.Costs...", "iPhone.4S.hr.", "Clothing.Index", "Hours.Worked", 
                             "Wage.Net", "Vacation.Days", "Big.Mac.min.", "Bread.kg.in.min.", 
                             "Rice.kg.in.min.", "Goods.and.Services..."), 
         new=c("FoodCosts", "iPhone4S", "ClothingIndex", "Hours", "NetWage", "VacationDays", 
               "BigMac", "Bread", "Rice", "GoodsServices"))

(price_data[,-1]) %>% plot_ly(type = 'parcoords', 
                             dimensions = list(
                               list(label = "Food Costs", values = ~FoodCosts),
                               list(label = "iPhone 4S", values = ~iPhone4S),
                               list(label = "Clothing Index", values = ~ClothingIndex),
                               list(label = "Hours Worked", values = ~Hours),
                               list(label = "Wage Net", values = ~NetWage),
                               list(label = "Vacation Days", values = ~VacationDays),
                               list(label = "Big Mac", values = ~BigMac),
                               list(label = "Bread", values = ~Bread),
                               list(label = "Rice", values = ~Rice),
                               list(label = "Goods Services", values = ~GoodsServices)))

Analysis:

6. Radar Chart Plots

reordmatr <- as.data.frame(reordmatr)
reordmatr$City = price_data$City

reordmatr_transformed <- reordmatr%>%tidyr::gather(variable, value, -City, factor_key=T)%>%arrange(City)

radar_plot <- reordmatr_transformed %>% ggplot(aes(x=variable, y=value, group=City)) + geom_polygon(fill="blue") + coord_polar() + theme_bw() + facet_wrap(~ City) + theme(axis.text.x = element_text(size = 5))

ggsave("radar_plot.png", width = 40, height = 60, units = "cm")

Analysis:

7. Best tools for Analysis

Analysis: Radar plot was the most effective. Its easy to see clusters and pattern and compare two very classes very well as well as picking outliers.

Assignment 2

1. Scatter and Trellis Plot Hours Per Week versus Age

knitr::opts_chunk$set(echo = TRUE)
adult_data <- read.csv("adult.csv", header = FALSE)
colnames(adult_data) <- c("age", "workclass", "fnlwgt", "educationType", "education", "marital", "occupation", "relationship", "race", "sex", "capitalGain", "capitalLoss", "hours", "nativeCountry", "incomeLevel")

# Scatter Plot
ggplot(data = adult_data, aes(x=age, y=hours, colour = incomeLevel)) + geom_point() + ggtitle("Hours per Week vs. Age")

# Trellis Plot
ggplot(data = adult_data, aes(x=age, y=hours, color = incomeLevel)) + geom_point()+ facet_grid(incomeLevel~.) + ggtitle("Hours per Week vs. Age")

Analysis: People earning less than 50K tend to work for longer hours and longer age. This information was difficult to see in the scatter plot

2. Density and Trellis Plot of Age

ggplot(adult_data, aes(age, colour = incomeLevel, fill = incomeLevel)) + geom_density(alpha = 0.1) + ggtitle("Denisty Plot of Age")

ggplot(data = adult_data, aes(x=age, color = incomeLevel, fill = incomeLevel)) + geom_density(alpha = 0.1) + facet_grid(marital~.) + ggtitle("Trellis Plot of Age vs. Marital Status")

Analysis:

3. 3D scatter plot and Trellis Plot of Education-num vs Age vs Captial Loss

adult_data %>% filter(capitalLoss > 0) %>% plot_ly(x = ~education, y = ~age, z = ~capitalLoss) %>% add_markers() %>% layout(scene = list(xaxis = list(title = 'Education'), yaxis = list(title = 'Age'), zaxis = list(title = 'CapitalLoss')))

Analysis:

4. Trellis Plot of Capital Loss versus Education-num

Analysis:

Apendix

knitr::opts_chunk$set(echo = FALSE)
library(data.table)
library(dplyr)
library(scales)
library(plotly)
library(ggplot2)
library(seriation)

knitr::opts_chunk$set(echo = TRUE)
price_data <- read.delim("prices-and-earnings.txt")
price_data <- price_data[,c(1,2,5,6,7,9,10,16,17,18,19)]
price_data_scale <- scale(price_data[,-1])

plot_ly(x=colnames(price_data_scale), y=rownames(price_data_scale), 
        z=price_data_scale, type="heatmap", colors =colorRamp(c("yellow", "red"))) %>% layout(title = "Heatmap of price dataset")

price_row_dist <- dist(x=price_data_scale, method = "euclidean", diag = TRUE)
price_col_dist <- dist(x=t(price_data_scale), method = "euclidean", diag = TRUE)
price_row_cor <- cor(x=t(price_data_scale))
price_col_cor <- cor(x=price_data_scale)
price_row_cor <- 1- price_row_cor
price_col_cor <- 1- price_col_cor

order1 <- seriate(price_row_dist, "OLO")
order2 <- seriate(price_col_dist, "OLO")
ord1 <- get_order(order1)
ord2 <- get_order(order2)

order1_cor <- seriate(as.dist(price_row_cor), "OLO")
order2_cor <- seriate(as.dist(price_col_cor), "OLO")
ord1_cor <- get_order(order1_cor)
ord2_cor <- get_order(order2_cor)

reordmatr <- price_data_scale[rev(ord1),ord2]
reordmatr_cor <- price_data_scale[rev(ord1_cor),ord2_cor]

plot_ly(x=colnames(reordmatr), y=rownames(reordmatr), 
        z=reordmatr, type="heatmap", colors =colorRamp(c("yellow", "red"))) %>% layout(title = "Heatmap of price dataset ordered using Optimal leaf ordering")

plot_ly(x=colnames(reordmatr_cor), y=rownames(reordmatr_cor), 
        z=reordmatr_cor, type="heatmap", colors =colorRamp(c("yellow", "red"))) %>% layout(title = "Heatmap of correlations of price dataset")

order1_tsp <- seriate(price_row_dist, "TSP")
order2_tsp <- seriate(price_col_dist, "TSP")
ord1_tsp <- get_order(order1_tsp)
ord2_tsp <- get_order(order2_tsp)
reordmatr <- price_data_scale[rev(ord1_tsp),ord2_tsp]

plot_ly(x=colnames(reordmatr), y=rownames(reordmatr), 
        z=reordmatr, type="heatmap", colors =colorRamp(c("yellow", "red"))) %>% layout(title = "Heatmap of price dataset ordered using Traveling Sales Man")

## Comparing the optimization
criterion(dist(price_data_scale[rev(ord1_tsp)], method = "euclidean"), method = c("Gradient_raw", "Path_length"))
criterion(dist(price_data_scale[rev(ord1)], method = "euclidean"), method = c("Gradient_raw", "Path_length"))

setnames(price_data, old = c("Food.Costs...", "iPhone.4S.hr.", "Clothing.Index", "Hours.Worked", 
                             "Wage.Net", "Vacation.Days", "Big.Mac.min.", "Bread.kg.in.min.", 
                             "Rice.kg.in.min.", "Goods.and.Services..."), 
         new=c("FoodCosts", "iPhone4S", "ClothingIndex", "Hours", "NetWage", "VacationDays", 
               "BigMac", "Bread", "Rice", "GoodsServices"))

(price_data[,-1]) %>% plot_ly(type = 'parcoords', 
                             dimensions = list(
                               list(label = "Food Costs", values = ~FoodCosts),
                               list(label = "iPhone 4S", values = ~iPhone4S),
                               list(label = "Clothing Index", values = ~ClothingIndex),
                               list(label = "Hours Worked", values = ~Hours),
                               list(label = "Wage Net", values = ~NetWage),
                               list(label = "Vacation Days", values = ~VacationDays),
                               list(label = "Big Mac", values = ~BigMac),
                               list(label = "Bread", values = ~Bread),
                               list(label = "Rice", values = ~Rice),
                               list(label = "Goods Services", values = ~GoodsServices)))

reordmatr <- as.data.frame(reordmatr)
reordmatr$City = price_data$City

reordmatr_transformed <- reordmatr%>%tidyr::gather(variable, value, -City, factor_key=T)%>%arrange(City)

radar_plot <- reordmatr_transformed %>% ggplot(aes(x=variable, y=value, group=City)) + geom_polygon(fill="blue") + coord_polar() + theme_bw() + facet_wrap(~ City) + theme(axis.text.x = element_text(size = 5))

ggsave("radar_plot.png", width = 40, height = 60, units = "cm")
knitr::include_graphics("radar_plot.png")
knitr::opts_chunk$set(echo = TRUE)
adult_data <- read.csv("adult.csv", header = FALSE)
colnames(adult_data) <- c("age", "workclass", "fnlwgt", "educationType", "education", "marital", "occupation", "relationship", "race", "sex", "capitalGain", "capitalLoss", "hours", "nativeCountry", "incomeLevel")

# Scatter Plot
ggplot(data = adult_data, aes(x=age, y=hours, colour = incomeLevel)) + geom_point() + ggtitle("Hours per Week vs. Age")

# Trellis Plot
ggplot(data = adult_data, aes(x=age, y=hours, color = incomeLevel)) + geom_point()+ facet_grid(incomeLevel~.) + ggtitle("Hours per Week vs. Age")

ggplot(adult_data, aes(age, colour = incomeLevel, fill = incomeLevel)) + geom_density(alpha = 0.1) + ggtitle("Denisty Plot of Age")

ggplot(data = adult_data, aes(x=age, color = incomeLevel, fill = incomeLevel)) + geom_density(alpha = 0.1) + facet_grid(marital~.) + ggtitle("Trellis Plot of Age vs. Marital Status")

adult_data %>% filter(capitalLoss > 0) %>% plot_ly(x = ~education, y = ~age, z = ~capitalLoss) %>% add_markers() %>% layout(scene = list(xaxis = list(title = 'Education'), yaxis = list(title = 'Age'), zaxis = list(title = 'CapitalLoss')))